Create new columns that convert each nutrient measure from per 100 g to per serving.
In [2]:
import pandas as pd
In [3]:
# Read the food table from the comma-separated file into a DataFrame
data = pd.read_csv(
    "demo_food_data.csv",
    sep=",",
)
In [4]:
# List every column label, one per line
for column_name in data.columns:
    print(column_name)
In [14]:
# Preview the first five rows of the loaded table
data.head(n=5)
Out[14]:
Next, we go through the data and find items whose product name contains certain category words. We then filter these matches to exclude the word most often confused with each category (e.g., donut-flavored coffee gets picked up under "donut").
Then we sort the items in each category by their rank on key factors such as sugar. For each factor, we pick the items at specified percentiles, so that we get a wide range of values on that factor.
In [5]:
# Per-100 g columns that get a per-serving counterpart
original_names = ['energy_100g', 'fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'starch_100g', 'fiber_100g',
                  'proteins_100g', 'salt_100g', 'sodium_100g', 'alcohol_100g', 'folates_100g', 'bicarbonate_100g',
                  'potassium_100g', 'chloride_100g', 'calcium_100g', 'iron_100g', 'fluoride_100g', 'iodine_100g', 'caffeine_100g', 'cocoa_100g']

# Per-serving column names: the text before the first "_" (e.g. "energy_100g" -> "energy")
new_names = [name.split("_")[0] for name in original_names]

# Parse the serving size: keep the text before the first " g".
# Assumes serving_size entries are strings that start with the gram amount — TODO confirm against the data.
# The .str accessor leaves missing entries as NaN instead of raising on them.
data["serv_size"] = data["serving_size"].str.split(" g").str[0]

# Factor that converts a per-100 g measure to a per-serving measure.
# pd.to_numeric still raises ValueError on text that is not a number, so bad rows are not silently hidden.
data["serv_factor"] = pd.to_numeric(data["serv_size"].str.strip()).astype(float) / 100

# For each target column, create the per-serving column with vectorised column arithmetic.
# Missing values in either operand propagate as NaN.
for per_100g_name, per_serving_name in zip(original_names, new_names):
    data[per_serving_name] = pd.to_numeric(data[per_100g_name]).astype(float) * data["serv_factor"]
In [6]:
# Preview the first five rows, now including the newly added columns
data.head(n=5)
Out[6]:
In [7]:
# Save the augmented table to a CSV file on disk
outfile = "demo_food_data_sd_regularized_columns.csv"
data.to_csv(path_or_buf=outfile)